{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Execute this cell to install the packages this notebook imports directly.\n",
    "# The requirement is quoted so the shell never treats `[...]` as a glob pattern;\n",
    "# the `pyspark` extra pulls in PySpark, which the imports in this notebook rely on.\n",
    "# NOTE(review): `summarization.py` may need further packages -- check its imports.\n",
    "%pip install \"sf-hamilton[visualization,pyspark]\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "source": [
    "# Notebook showing how to run the PDF summarizer on Spark [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/apache/hamilton/blob/main/examples/LLM_Workflows/pdf_summarizer/run_on_spark/run.ipynb) [![GitHub badge](https://img.shields.io/badge/github-view_source-2b3137?logo=github)](https://github.com/apache/hamilton/blob/main/examples/LLM_Workflows/pdf_summarizer/run_on_spark/run.ipynb)\n",
    "\n",
    "In this notebook we'll walk through what's in `run.py`, which shows how one\n",
    "can set up a Spark job to run the PDF summarizer dataflow defined in `summarization.py`.\n",
    "\n",
    "Note: if you're on a Mac, you might need to set the following in your environment before you start Jupyter/this kernel:\n",
    "> OBJC_DISABLE_INITIALIZE_FORK_SAFETY=YES\n",
    "\n",
    "You can provide your `OPENAI_API_KEY` via the environment as well, or modify this notebook directly."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:29:26.741436Z",
     "start_time": "2023-08-19T20:29:24.782064Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# imports\n",
    "import os\n",
    "import pandas as pd\n",
    "import summarization\n",
    "from pyspark.sql import SparkSession\n",
    "\n",
    "from hamilton import driver, log_setup\n",
    "from hamilton.plugins import h_spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:29:43.315228Z",
     "start_time": "2023-08-19T20:29:38.212840Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# more setup for spark, etc.\n",
    "# The key is read from the environment; it is None when the variable is not set.\n",
    "openai_api_key = os.environ.get(\"OPENAI_API_KEY\")\n",
    "log_setup.setup_logging(log_level=log_setup.LOG_LEVELS[\"INFO\"])\n",
    "# create the SparkSession -- note in real life, you'd adjust the number of executors to control parallelism.\n",
    "builder = SparkSession.builder\n",
    "if openai_api_key:\n",
    "    # forward the key to the executors' environment.\n",
    "    builder = builder.config(\"spark.executorEnv.OPENAI_API_KEY\", openai_api_key)\n",
    "else:\n",
    "    # Never hand a missing key to .config(): config values travel to Spark as strings, so None\n",
    "    # would presumably reach the executors as a bogus key and only fail much later (TODO confirm\n",
    "    # for your PySpark version). Warn here instead, where the cause is obvious.\n",
    "    print(\"WARNING: OPENAI_API_KEY is not set; set it in the environment or assign openai_api_key in this cell.\")\n",
    "# you might need the following in case things don't work for you:\n",
    "# builder = builder.config(\"spark.sql.warehouse.dir\", \"~/temp/dwh\")\n",
    "# builder = builder.master(\"local[1]\")  # Change this in real life.\n",
    "spark = builder.getOrCreate()\n",
    "spark.sparkContext.setLogLevel(\"info\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:29:43.320253Z",
     "start_time": "2023-08-19T20:29:43.317533Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# Set up specifics for this example\n",
    "openai_gpt_model = \"gpt-3.5-turbo-0613\"\n",
    "content_type = \"Scientific article\"\n",
    "user_query = \"Can you ELI5 the paper?\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "start_time": "2023-08-19T20:30:25.359581Z"
    },
    "collapsed": false,
    "is_executing": true,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# Build the input dataframe: one row per PDF, in a single `pdf_source` column.\n",
    "# replace this with SQL or however you'd get the data you need in.\n",
    "# TODO: update this to point to a PDF or two.\n",
    "pdf_sources = [\"a/path/to/a/PDF/CDMS2022-hamilton-paper.pdf\"]\n",
    "pandas_df = pd.DataFrame({\"pdf_source\": pdf_sources})\n",
    "# hand the pandas frame to Spark and print it as a quick sanity check.\n",
    "df = spark.createDataFrame(pandas_df)\n",
    "df.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:29:54.797884Z",
     "start_time": "2023-08-19T20:29:54.783823Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[WARNING] 2023-08-19 14:05:57,410 hamilton.telemetry(127): Note: Hamilton collects completely anonymous data about usage. This will help us improve Hamilton over time. See https://github.com/apache/hamilton#usage-analytics--data-privacy for details.\n"
     ]
    }
   ],
   "source": [
    "# Build the Hamilton driver for the summarization dataflow.\n",
    "driver_config = {\"file_type\": \"pdf\"}\n",
    "modules = [summarization]  # more than one module can be listed here\n",
    "# the graph adapter used for running on PySpark.\n",
    "adapter = h_spark.PySparkUDFGraphAdapter()\n",
    "dr = driver.Driver(driver_config, *modules, adapter=adapter)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:31:02.455114Z",
     "start_time": "2023-08-19T20:31:02.435225Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# Inputs for the UDFs: every dataframe column name maps to the spark dataframe that holds it,\n",
    "# followed by the other scalar inputs/values/objects the UDFs need.\n",
    "execute_inputs = {\n",
    "    **{column_name: df for column_name in df.columns},\n",
    "    \"openai_gpt_model\": openai_gpt_model,\n",
    "    \"content_type\": content_type,\n",
    "    \"user_query\": user_query,\n",
    "}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:31:04.846274Z",
     "start_time": "2023-08-19T20:31:04.841799Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# tell Hamilton what columns need to be appended to the dataframe.\n",
    "cols_to_append = [\n",
    "    \"raw_text\",\n",
    "    \"chunked_text\",\n",
    "    \"summarized_text\",\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:31:07.500434Z",
     "start_time": "2023-08-19T20:31:07.478062Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [
    {
     "data": {
      "image/svg+xml": [
       "<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n",
       "<!DOCTYPE svg PUBLIC \"-//W3C//DTD SVG 1.1//EN\"\n",
       " \"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd\">\n",
       "<!-- Generated by graphviz version 8.0.5 (20230430.1635)\n",
       " -->\n",
       "<!-- Pages: 1 -->\n",
       "<svg width=\"836pt\" height=\"404pt\"\n",
       " viewBox=\"0.00 0.00 835.69 404.00\" xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\">\n",
       "<g id=\"graph0\" class=\"graph\" transform=\"scale(1 1) rotate(0) translate(4 400)\">\n",
       "<polygon fill=\"white\" stroke=\"none\" points=\"-4,4 -4,-400 831.69,-400 831.69,4 -4,4\"/>\n",
       "<!-- content_type -->\n",
       "<g id=\"node1\" class=\"node\">\n",
       "<title>content_type</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" cx=\"225.11\" cy=\"-306\" rx=\"83.08\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"225.11\" y=\"-300.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: content_type</text>\n",
       "</g>\n",
       "<!-- summarize_text_from_summaries_prompt -->\n",
       "<g id=\"node7\" class=\"node\">\n",
       "<title>summarize_text_from_summaries_prompt</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"171.11\" cy=\"-162\" rx=\"171.11\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"171.11\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">summarize_text_from_summaries_prompt</text>\n",
       "</g>\n",
       "<!-- content_type&#45;&gt;summarize_text_from_summaries_prompt -->\n",
       "<g id=\"edge13\" class=\"edge\">\n",
       "<title>content_type&#45;&gt;summarize_text_from_summaries_prompt</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M211.36,-287.79C204.05,-277.85 195.49,-264.81 190.11,-252 181.95,-232.52 177.13,-209.11 174.39,-191.19\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"177.72,-190.72 172.87,-181.3 170.78,-191.69 177.72,-190.72\"/>\n",
       "</g>\n",
       "<!-- summarize_chunk_of_text_prompt -->\n",
       "<g id=\"node11\" class=\"node\">\n",
       "<title>summarize_chunk_of_text_prompt</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"341.11\" cy=\"-234\" rx=\"141.94\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"341.11\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">summarize_chunk_of_text_prompt</text>\n",
       "</g>\n",
       "<!-- content_type&#45;&gt;summarize_chunk_of_text_prompt -->\n",
       "<g id=\"edge14\" class=\"edge\">\n",
       "<title>content_type&#45;&gt;summarize_chunk_of_text_prompt</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M252.31,-288.59C267.68,-279.31 287.11,-267.59 303.88,-257.47\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"305.12,-260.2 311.88,-252.04 301.51,-254.21 305.12,-260.2\"/>\n",
       "</g>\n",
       "<!-- raw_text -->\n",
       "<g id=\"node2\" class=\"node\">\n",
       "<title>raw_text</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"443.74,-324 380.49,-324 380.49,-288 443.74,-288 443.74,-324\"/>\n",
       "<text text-anchor=\"middle\" x=\"412.11\" y=\"-300.95\" font-family=\"Times,serif\" font-size=\"14.00\">raw_text</text>\n",
       "</g>\n",
       "<!-- chunked_text -->\n",
       "<g id=\"node4\" class=\"node\">\n",
       "<title>chunked_text</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"589.49,-252 500.74,-252 500.74,-216 589.49,-216 589.49,-252\"/>\n",
       "<text text-anchor=\"middle\" x=\"545.11\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">chunked_text</text>\n",
       "</g>\n",
       "<!-- raw_text&#45;&gt;chunked_text -->\n",
       "<g id=\"edge5\" class=\"edge\">\n",
       "<title>raw_text&#45;&gt;chunked_text</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M443.97,-288.23C461.51,-279 483.51,-267.42 502.51,-257.43\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"503.91,-260.12 511.13,-252.36 500.65,-253.92 503.91,-260.12\"/>\n",
       "</g>\n",
       "<!-- prompt_and_text_content -->\n",
       "<g id=\"node3\" class=\"node\">\n",
       "<title>prompt_and_text_content</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"524.11\" cy=\"-90\" rx=\"106.11\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"524.11\" y=\"-84.95\" font-family=\"Times,serif\" font-size=\"14.00\">prompt_and_text_content</text>\n",
       "</g>\n",
       "<!-- summarized_text -->\n",
       "<g id=\"node6\" class=\"node\">\n",
       "<title>summarized_text</title>\n",
       "<polygon fill=\"none\" stroke=\"black\" points=\"646.36,-36 535.86,-36 535.86,0 646.36,0 646.36,-36\"/>\n",
       "<text text-anchor=\"middle\" x=\"591.11\" y=\"-12.95\" font-family=\"Times,serif\" font-size=\"14.00\">summarized_text</text>\n",
       "</g>\n",
       "<!-- prompt_and_text_content&#45;&gt;summarized_text -->\n",
       "<g id=\"edge11\" class=\"edge\">\n",
       "<title>prompt_and_text_content&#45;&gt;summarized_text</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M540.33,-72.05C548.36,-63.67 558.21,-53.38 567.09,-44.1\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"569.19,-46.92 573.58,-37.28 564.14,-42.08 569.19,-46.92\"/>\n",
       "</g>\n",
       "<!-- summarized_chunks -->\n",
       "<g id=\"node5\" class=\"node\">\n",
       "<title>summarized_chunks</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"448.11\" cy=\"-162\" rx=\"87.69\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"448.11\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">summarized_chunks</text>\n",
       "</g>\n",
       "<!-- chunked_text&#45;&gt;summarized_chunks -->\n",
       "<g id=\"edge8\" class=\"edge\">\n",
       "<title>chunked_text&#45;&gt;summarized_chunks</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M521.14,-215.7C508.74,-206.75 493.44,-195.71 480.05,-186.05\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"482.31,-182.64 472.15,-179.62 478.21,-188.31 482.31,-182.64\"/>\n",
       "</g>\n",
       "<!-- summarized_chunks&#45;&gt;prompt_and_text_content -->\n",
       "<g id=\"edge2\" class=\"edge\">\n",
       "<title>summarized_chunks&#45;&gt;prompt_and_text_content</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M466.51,-144.05C475.87,-135.43 487.42,-124.8 497.71,-115.32\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"499.64,-118.38 504.62,-109.03 494.9,-113.23 499.64,-118.38\"/>\n",
       "</g>\n",
       "<!-- summarize_text_from_summaries_prompt&#45;&gt;prompt_and_text_content -->\n",
       "<g id=\"edge3\" class=\"edge\">\n",
       "<title>summarize_text_from_summaries_prompt&#45;&gt;prompt_and_text_content</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M249.02,-145.55C307.6,-133.93 387.53,-118.09 446.13,-106.46\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"446.69,-109.72 455.81,-104.35 445.32,-102.86 446.69,-109.72\"/>\n",
       "</g>\n",
       "<!-- openai_gpt_model -->\n",
       "<g id=\"node8\" class=\"node\">\n",
       "<title>openai_gpt_model</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" cx=\"723.11\" cy=\"-234\" rx=\"104.58\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"723.11\" y=\"-228.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: openai_gpt_model</text>\n",
       "</g>\n",
       "<!-- openai_gpt_model&#45;&gt;summarized_chunks -->\n",
       "<g id=\"edge10\" class=\"edge\">\n",
       "<title>openai_gpt_model&#45;&gt;summarized_chunks</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M666.13,-218.5C621.33,-207.09 558.9,-191.2 512.33,-179.35\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"513.31,-175.73 502.75,-176.65 511.58,-182.51 513.31,-175.73\"/>\n",
       "</g>\n",
       "<!-- openai_gpt_model&#45;&gt;summarized_text -->\n",
       "<g id=\"edge12\" class=\"edge\">\n",
       "<title>openai_gpt_model&#45;&gt;summarized_text</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M724.92,-215.72C726.11,-197.11 725.85,-167.07 715.11,-144 695.42,-101.68 655.1,-65.37 625.77,-42.96\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"628.16,-39.63 618.06,-36.46 623.98,-45.24 628.16,-39.63\"/>\n",
       "</g>\n",
       "<!-- user_query -->\n",
       "<g id=\"node9\" class=\"node\">\n",
       "<title>user_query</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" cx=\"630.11\" cy=\"-162\" rx=\"76.43\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"630.11\" y=\"-156.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: user_query</text>\n",
       "</g>\n",
       "<!-- user_query&#45;&gt;prompt_and_text_content -->\n",
       "<g id=\"edge4\" class=\"edge\">\n",
       "<title>user_query&#45;&gt;prompt_and_text_content</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M605.26,-144.59C591.38,-135.42 573.86,-123.85 558.66,-113.81\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"560.82,-110.39 550.55,-107.8 556.96,-116.23 560.82,-110.39\"/>\n",
       "</g>\n",
       "<!-- pdf_source -->\n",
       "<g id=\"node10\" class=\"node\">\n",
       "<title>pdf_source</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" stroke-dasharray=\"5,2\" cx=\"412.11\" cy=\"-378\" rx=\"76.43\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"412.11\" y=\"-372.95\" font-family=\"Times,serif\" font-size=\"14.00\">Input: pdf_source</text>\n",
       "</g>\n",
       "<!-- pdf_source&#45;&gt;raw_text -->\n",
       "<g id=\"edge1\" class=\"edge\">\n",
       "<title>pdf_source&#45;&gt;raw_text</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M412.11,-359.7C412.11,-352.24 412.11,-343.32 412.11,-334.97\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"415.61,-335.1 412.11,-325.1 408.61,-335.1 415.61,-335.1\"/>\n",
       "</g>\n",
       "<!-- summarize_chunk_of_text_prompt&#45;&gt;summarized_chunks -->\n",
       "<g id=\"edge9\" class=\"edge\">\n",
       "<title>summarize_chunk_of_text_prompt&#45;&gt;summarized_chunks</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M367.29,-215.88C381.23,-206.76 398.54,-195.43 413.57,-185.6\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"415.13,-188.11 421.58,-179.71 411.29,-182.25 415.13,-188.11\"/>\n",
       "</g>\n",
       "<!-- tokenizer_encoding -->\n",
       "<g id=\"node12\" class=\"node\">\n",
       "<title>tokenizer_encoding</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"545.11\" cy=\"-306\" rx=\"83.6\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"545.11\" y=\"-300.95\" font-family=\"Times,serif\" font-size=\"14.00\">tokenizer_encoding</text>\n",
       "</g>\n",
       "<!-- tokenizer_encoding&#45;&gt;chunked_text -->\n",
       "<g id=\"edge6\" class=\"edge\">\n",
       "<title>tokenizer_encoding&#45;&gt;chunked_text</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M545.11,-287.7C545.11,-280.24 545.11,-271.32 545.11,-262.97\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"548.61,-263.1 545.11,-253.1 541.61,-263.1 548.61,-263.1\"/>\n",
       "</g>\n",
       "<!-- max_token_length -->\n",
       "<g id=\"node13\" class=\"node\">\n",
       "<title>max_token_length</title>\n",
       "<ellipse fill=\"none\" stroke=\"black\" cx=\"727.11\" cy=\"-306\" rx=\"80.01\" ry=\"18\"/>\n",
       "<text text-anchor=\"middle\" x=\"727.11\" y=\"-300.95\" font-family=\"Times,serif\" font-size=\"14.00\">max_token_length</text>\n",
       "</g>\n",
       "<!-- max_token_length&#45;&gt;chunked_text -->\n",
       "<g id=\"edge7\" class=\"edge\">\n",
       "<title>max_token_length&#45;&gt;chunked_text</title>\n",
       "<path fill=\"none\" stroke=\"black\" d=\"M688.07,-289.98C662.4,-280.11 628.42,-267.04 600.04,-256.12\"/>\n",
       "<polygon fill=\"black\" stroke=\"black\" points=\"601.45,-252.53 590.86,-252.21 598.94,-259.07 601.45,-252.53\"/>\n",
       "</g>\n",
       "</g>\n",
       "</svg>\n"
      ],
      "text/plain": [
       "<graphviz.graphs.Digraph at 0x123e59a30>"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Render the part of the dataflow that will run to produce the appended columns.\n",
    "# Only the requested outputs and the runtime inputs are passed; the remaining\n",
    "# optional arguments are left at their defaults.\n",
    "dr.visualize_execution(cols_to_append, inputs=execute_inputs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-08-19T20:25:26.570950Z",
     "start_time": "2023-08-19T20:25:26.563166Z"
    },
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# tell Hamilton to tell Spark what to do\n",
    "# NOTE: `df` is rebound here to whatever dr.execute returns for the requested columns;\n",
    "# the input dataframe stays reachable through `execute_inputs`.\n",
    "df = dr.execute(cols_to_append, inputs=execute_inputs)\n",
    "# print Spark's plan for the resulting dataframe.\n",
    "df.explain()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": false,
    "jupyter": {
     "outputs_hidden": false
    }
   },
   "outputs": [],
   "source": [
    "# print the first rows of the resulting dataframe.\n",
    "# NOTE(review): presumably this is where Spark actually runs the UDFs (and so calls OpenAI) -- confirm.\n",
    "df.show()\n",
    "# you can also save the dataframe as a json file, parquet, etc.\n",
    "# df.write.json(\"processed_pdfs\")\n",
    "# shut down the SparkSession created earlier in this notebook.\n",
    "spark.stop()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
